#==============================================================================#
# 8-finish.R
# Matt Curtis  
# mjdcurtis@gmail.com
#------------------------------------------------------------------------------#
#  updated feb 15 2023
#==============================================================================#

# Start from an empty workspace and trigger a garbage collection.
rm(list = ls())
gc()

# library(tidyverse)
# library(data.table)
library(arrow)
library(stringr)
library(tidytable)

#------------------------------------------------------------------------------#

# Root folder of the replication package; set it once, here.
# setwd("~/Replication package")
genipath <- "~/Replication package/"

#------------------------------------------------------------------------------#
# apply the same string cleaning to each location
#
# loc: character vector of raw location strings.
# Returns a character vector of the same length: transliterated to ASCII,
# lower-cased, with the "o arr.:" / "o prov.:" markers removed, every
# character other than a-z, "_", "-" and space turned into a space,
# whitespace squished, and NA replaced by "".
cleanloc <- function(loc) {
  loc %>%
    iconv(to = "ASCII//TRANSLIT") %>% # remove accents
    str_to_lower() %>%
    # this shows up only in one file but I put it here anyway
    str_remove_all(coll("o arr.:")) %>%
    str_remove_all(coll("o prov.:")) %>%
    # Remove everything but letters, - and _ (and spaces). The text is
    # already lower-case, so a-z is enough. The former range A-z also covered
    # ASCII 91-96 ("[", "\", "]", "^", "_", "`"), which let "\", "^" and "`"
    # slip through and required separate replacements for the brackets.
    str_replace_all("[^a-z_\\- ]", " ") %>%
    # then replace - with _
    # str_replace_all("\\-", "_") %>%
    str_squish() %>%
    replace(is.na(.), "")
}

#------------------------------------------------------------------------------#
# load main data

# Read the raw records and drop exact duplicate rows.
file <- paste0(genipath, "2_scripts/2_0_tempfiles/fr-raw.parquet")
data <- distinct(tidytable(read_parquet(file)))

# check to make sure obs are not gained
check <- nrow(data)

# Rough age at death (death year minus birth year). Ages below 0 or above 120
# are flagged in bad_death_age and blanked out.
data <- data %>%
  mutate(death_age_rough = death_start_year - birth_start_year) %>%
  mutate(bad_death_age = death_age_rough < 0 | death_age_rough > 120) %>%
  mutate(death_age_rough = replace(death_age_rough, bad_death_age, NA))


# nkids
# count number of kids per punion:
#   nkids = children recorded for the union,
#   d5    = children whose rough death age is known and <= 5,
#   nNA   = children with no usable rough death age.
nkids <- data %>%
  mutate(missing_age = as.numeric(is.na(death_age_rough))) %>%
  mutate(died_by5 = as.numeric(death_age_rough <= 5)) %>%
  summarize(
    # na.rm = TRUE: without it one child with a missing death age turned the
    # whole union's count into NA, which was then overwritten with 0 and so
    # hid deaths that are actually recorded. nNA keeps track of the unknowns.
    d5 = sum(died_by5, na.rm = TRUE),
    nkids = n(),
    nNA = sum(missing_age),
    .by = punion
  )

# link to munions then to profile id: each profile collects the counts of the
# (up to three) unions listed in munion1-munion3.
# NOTE(review): a union with no matching children is NA after the join, so
# the per-profile sums are NA whenever any listed union has no recorded
# children -- confirm that this is intended.
nkids <- data %>%
  select(profile_id, munion1, munion2, munion3) %>%
  pivot_longer(
    cols = c(munion1, munion2, munion3),
    values_to = "punion"
  ) %>%
  filter(punion != "") %>%
  select(-name) %>%
  left_join(nkids, by = "punion") %>%
  summarize(
    d5 = sum(d5),
    nkids = sum(nkids),
    nNA = sum(nNA),
    .by = profile_id
  )

# attach the counts; the join must not add observations
check <- nrow(data)
data <- left_join(data, nkids, by = "profile_id")
stopifnot(check == nrow(data))


#------------------------------------------------------------------------------#
# clean location strings

# Run cleanloc() over the birth and death location strings so they are
# normalised the same way as the crosswalk they are matched against.
# (The former trailing comma after the last column name left an empty
# argument in c(); it is removed here.)
# NOTE(review): the marriage_* location columns are not cleaned here, although
# the crosswalk is later joined under a "marriage_" prefix as well -- confirm.
data <- data %>%
  mutate(across(
    c(
      "birth_city",
      "birth_county",
      "birth_state",
      "birth_place_name",
      "birth_country",
      "death_city",
      "death_county",
      "death_state",
      "death_place_name",
      "death_country"
    ),
    ~ cleanloc(.)
  ))


#------------------------------------------------------------------------------#
# load geo crosswalk

# geo
# Crosswalk from location strings to customs / geography attributes.
# The path is built from genipath, like the main data file, so the script
# does not depend on the current working directory.
xwalk_file <- paste0(
  genipath,
  "2_scripts/2_0_tempfiles/fr-locations-customs.parquet"
)

# xwalk <- fread('./2_scripts/2_0_tempfiles/fr-locations-original.csv')
xwalk <- read_parquet(xwalk_file) %>%
  tidytable() %>%
  mutate(place_name = place) %>%
  select(-place) %>%
  distinct() %>%
  select(-locid) %>%
  distinct() %>%
  # keep only location keys that identify exactly one crosswalk row
  # (country is deliberately left out of the key)
  mutate(
    n = n(),
    .by = c(city, county, state, place_name)
  ) %>%
  filter(n == 1) %>%
  select(-n) %>%
  # same string cleaning as the main data, so the keys can match.
  # NOTE(review): uniqueness is checked before cleaning; keys that only
  # collide after cleaning are not removed here and would trip the row-count
  # checks on the later joins.
  mutate(across(
    c(
      ends_with("place_name"),
      ends_with("city"),
      ends_with("county"),
      ends_with("state"),
      ends_with("country")
    ),
    ~ cleanloc(.)
  )) %>%
  mutate(
    city_admin_flag = as.numeric(city_donated == 1 & is.na(city_level))
  ) %>%
  select(
    city, city_admin_flag,
    county, state, country, partible, excluded, subregion,
    city_distance,
    region, place_name, primo_ultimo, impartible_type, partible_type,
    custom = Custom1, droitecrit, # geonames_subregion,geonames_place,
    longitude, latitude,
    pop
  )



#------------------------------------------------------------------------------#
# link geo data

# Attach the crosswalk three times, once per event, by prefixing every
# crosswalk column with the event name and joining on the columns that the
# prefixed crosswalk shares with the data.
oldnames <- names(xwalk)

# reference row count: none of the joins may add observations
check <- nrow(data)

for (prefix in c("birth_", "death_", "marriage_")) {
  # renamed copy; xwalk itself keeps its original column names
  geo <- setNames(xwalk, paste0(prefix, oldnames))

  # same keys a natural join would pick, but explicit, and guarded so that a
  # missing set of key columns fails loudly instead of joining on nothing
  keys <- intersect(names(data), names(geo))
  stopifnot(length(keys) > 0)

  data <- left_join(data, geo, by = keys)
  stopifnot(check == nrow(data))
}

#------------------------------------------------------------------------------#
# add new variables

# Rough age at marriage (marriage year minus birth year). Ages below 0 or
# above 120 are flagged in bad_marriage_age and blanked out.
data <- data %>%
  mutate(marriage_age_rough = marriage_start_year - birth_start_year) %>%
  mutate(
    bad_marriage_age = marriage_age_rough < 0 | marriage_age_rough > 120
  ) %>%
  mutate(
    marriage_age_rough = replace(marriage_age_rough, bad_marriage_age, NA)
  )

# Analysis variables derived from the birth-place attributes.
data <- data %>%
  mutate(sample = "France") %>%
  # has_spouse: a first marital union is recorded; celibate is its complement
  mutate(has_spouse = munion1 != "") %>%
  mutate(celibate = 1 - has_spouse) %>%
  # filter(birth_start_year>=1670&birth_start_year<=1793-40)%>%
  mutate(dfem = as.numeric(gender == "f")) %>%
  # partible: 1 when the birth-place custom is "Partible", 0 for any other
  # non-blank value, NA when blank or missing ...
  mutate(partible = replace(birth_partible, birth_partible == "", NA)) %>%
  mutate(partible = as.numeric(partible == "Partible")) %>%
  # ... except that droit ecrit birth places with no value are set to 0.
  # (This statement used to be repeated further down; once is enough.)
  mutate(
    partible = replace(partible, birth_droitecrit == 1 & is.na(partible), 0)
  ) %>%
  mutate(region = birth_region) %>%
  mutate(decade = floor(birth_start_year / 10) * 10) %>%
  # urban: birth place has a non-missing, non-zero population
  mutate(urban = as.numeric(!is.na(birth_pop) & birth_pop != 0)) %>%
  # excluded: 1 when birth_excluded contains "yes", 0 when it does not,
  # NA when birth_excluded is missing ...
  mutate(excluded = case_when(
    str_detect(birth_excluded, "yes") ~ 1,
    !str_detect(birth_excluded, "yes") ~ 0
  )) %>%
  # ... except that droit ecrit birth places with no value are set to 1
  mutate(
    excluded = replace(excluded, birth_droitecrit == 1 & is.na(excluded), 1)
  ) %>%
  # NOTE(review): a distance of 0 yields -Inf here -- confirm how downstream
  # code treats it. The division by 1000 presumably converts m to km.
  mutate(log_distance_to_city = log((birth_city_distance / 1000)))

# birth order
# Rank of each child within its parental union by birth year; rows without a
# parental union (punion == "") get NA.
data <- data %>%
  arrange(punion, birth_start_year) %>%
  mutate(
    birth_order = row_number(),
    .by = punion
  ) %>%
  mutate(birth_order = replace(birth_order, punion == "", NA)) %>%
  # gender specific birth order: rank among siblings of the same gender
  arrange(punion, gender, birth_start_year) %>%
  mutate(
    birth_order_s = row_number(),
    .by = c(punion, gender)
  ) %>%
  # blank out birth_order_s itself; this step used to copy birth_order into
  # birth_order_s, which threw the gender-specific rank away
  mutate(birth_order_s = replace(birth_order_s, punion == "", NA))


# add firstborn location
# For every parental union, take the birth-place attributes of the earliest
# born child that has a geocoded birth place (non-missing birth_latitude),
# and store them as first_* columns.
first <- data %>%
  filter(punion != "" & !is.na(birth_latitude)) %>%
  arrange(punion, birth_start_year) %>%
  summarize(
    first_droitecrit           = first(birth_droitecrit),
    first_impartible_type      = first(birth_impartible_type),
    first_latitude             = first(birth_latitude),
    first_longitude            = first(birth_longitude),
    first_partible             = first(partible),
    first_excluded             = first(excluded),
    first_subregion            = first(birth_subregion),
    # first_fid = first(birth_fid),
    first_region               = first(birth_region),
    first_pop                  = first(birth_pop),
    first_urban                = first(urban),
    first_city_distance        = first(birth_city_distance),
    first_log_distance_to_city = first(log_distance_to_city),
    .by = punion
  )

# give every sibling the first_* columns; the join must not add observations
check <- nrow(data)
data <- data %>%
  left_join(first)
stopifnot(check == nrow(data))

# the final sample: born 1670-1850, with a geocoded birth place of their own
# or of the first geocoded sibling, and gender recorded as "f" or "m"
sample <- data %>%
  filter(birth_start_year >= 1670 & birth_start_year <= 1850) %>%
  filter(!is.na(birth_latitude) | !is.na(first_latitude)) %>%
  filter(gender == "f" | gender == "m")

# NOTE(review): `sample` is built above but never used; the file written
# below contains the full `data`. Confirm which of the two is meant to be
# saved.
# The output path is built from genipath, like the main input file, so the
# script does not depend on the current working directory.
fwrite(data, paste0(genipath, "3_outputs/3_1_datasets/fr-clean.csv"))